def dependencies():
    """Import every third-party dependency once and publish each name as a
    module-level global, then widen the pandas display limits.

    NOTE(review): the global-import pattern keeps the notebook header short
    but hides the real dependency list from static tooling; plain top-level
    imports would be the conventional alternative.
    """
    global pd, np, mno, time
    global ff,px,go,make_subplots
    global KMeans, cdist, StandardScaler
    global SVC, PCA, train_test_split
    global GridSearchCV, metrics, RandomizedSearchCV
    import pandas as pd
    import numpy as np
    import missingno as mno                 # missing-value matrix visualisation
    import plotly.figure_factory as ff
    from plotly import express as px
    from plotly.subplots import make_subplots
    import plotly.graph_objects as go
    from sklearn.cluster import KMeans
    from scipy.spatial.distance import cdist
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    from sklearn.decomposition import PCA
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn import metrics
    import time
    # relax notebook display truncation so wide/long frames render fully
    pd.set_option('display.max_columns',None)
    pd.set_option('display.max_rows',500)
dependencies()
debug=False # set to False for 0 verbose
from inspect import currentframe, getouterframes
def traces():
    """Print the calling stack (frame name and line number) while debugging.

    Walks outward from the caller's frame and stops after reporting the
    module-level frame.  Does nothing when the global ``debug`` flag is False.
    """
    if not debug:
        return
    print("Stack trace")
    print("%15s" % "framename", ' : lineno')
    for frame_info in getouterframes(currentframe().f_back):
        print("%15s" % frame_info.function, ' : ', frame_info.lineno)
        if frame_info.function == '<module>':
            break
class dummies:
    """One-hot encoder that learns category levels on ``fit`` and reuses the
    same levels on ``transform``, so train and test encodings share an
    identical column schema (no data leak from the test set).
    """
    def __init__(self):
        # ref maps column name -> list of categories seen during fit,
        # ordered by descending frequency (value_counts order)
        self.ref = {}
        self.fitted = False

    def fit(self, df):
        """Record the category levels of every object-dtype column of *df*."""
        for col in df.select_dtypes(include='object').columns:
            self.ref[col] = list(df[col].value_counts().index)
        self.fitted = True
        return

    def transform(self, df):
        """Return a copy of *df* with the fitted columns one-hot encoded.

        Levels unseen at fit time simply produce all-zero indicator columns,
        keeping the output schema identical to the training frame.

        Raises
        ------
        ValueError
            If called before :meth:`fit`.
        """
        # fix: the original raised and then had an unreachable `return`
        if not self.fitted:
            raise ValueError("please fit first")
        df = df.copy()
        for col, levels in self.ref.items():
            for level in levels:
                # indicator column: 1 where the value matches the level, else 0
                df[col + "_" + level] = (df[col] == level).astype(int)
            df.drop(col, axis=1, inplace=True)
        # downcast numeric columns to float32 where possible; non-numeric
        # columns are left untouched.  Per-column try/except replaces the
        # deprecated errors='ignore' option of pd.to_numeric (removed in
        # pandas 3.0) with the same semantics.
        for col in df.columns:
            try:
                df[col] = pd.to_numeric(df[col], downcast='float')
            except (ValueError, TypeError):
                pass
        return df

    def fit_transform(self, df):
        """Convenience wrapper: fit on *df*, then transform it."""
        self.fit(df)
        return self.transform(df)
def nulsCount(df):
    """Summarise missing / unexpected values per column of *df*.

    Counts, for every column: NULLs (``isnull``), NaNs (``isna``), blank
    strings ("" or " "), and common missing-value sentinels
    ("-", "?", ".", "NA", "N/A", "Unknown"), plus each count as a
    percentage of the total rows.

    Returns
    -------
    pandas.DataFrame or None
        One row per column that has at least one such value; ``None`` when
        the data is clean (or has no rows), so callers can truth-test it.
    """
    # fix: guard the empty frame up front (the percentage lambdas below
    # would otherwise divide by zero); this also removes the original
    # bare `except: pass` that silently swallowed every error.
    if df.shape[0] == 0:
        return
    d2 = pd.DataFrame(columns=["NULL", "NAN", "BLANKS", "UNEXP"])
    d2["NULL"] = df.isnull().sum().astype('uint32')                  # null values
    d2["NAN"] = df.isna().sum().astype('uint32')                     # NaN
    d2["BLANKS"] = df.isin(["", " "]).sum().astype('uint32')         # blank strings
    d2["UNEXP"] = df.isin(["-", "?", ".", "NA", "N/A", "Unknown"]).sum().astype('uint32')  # sentinels
    # shortlist: keep only columns with at least one flagged value
    d2 = d2.loc[(d2 != 0).any(axis=1)]
    # convert counts to percentages of total rows
    for c in ["NULL", "NAN", "BLANKS", "UNEXP"]:
        d2[c + " %"] = d2[c].apply(lambda x: round(x * 100 / df.shape[0], 2))
    # interleave count / percentage columns for readability
    d2 = d2[["NULL", "NULL %", "NAN", "NAN %", "BLANKS", "BLANKS %", "UNEXP", "UNEXP %"]]
    return d2 if d2.shape[0] else None
def interpolate(df):
    """impute missing values to seamlessly merge with the distribution
    does not use mean / mode as imputing value
    performs ML based prediction that best fits the overall distribution
    ensure to drop any uniqueKey columns to avoid erroneous results
    Note: designed for complete datasets with no train test splits
    no explicit fit / transform
    most suitable for unsupervised learning"""
    # local imports: heavy dependencies loaded only when imputation runs
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import StandardScaler
    from xgboost import XGBClassifier
    import re
    from imblearn.over_sampling import RandomOverSampler
    traces() # debug info
    df=df.copy()
    # list features with missing values (nulsCount returns None when clean)
    nuls = nulsCount(df)
    try:
        # NOTE(review): comparing a DataFrame with `== None` raises an
        # "ambiguous truth value" error, which the bare except below absorbs;
        # `nuls is None` would express the intent without the exception.
        if nuls==None:
            print('Nothing to impute\ngood to go')
            return
    except:
        pass
    # impute columns from fewest to most missing values, so each finished
    # column enlarges the complete cases available to the next one
    nuls.sort_values(by="NULL",ascending=True,inplace=True)
    nul_var = list(nuls.index)
    # loop over null listed features, starting from least nuls to higher
    for col in nul_var:
        traces() # debug info
        # Complete case: rows where the target column is present...
        t1 = df.dropna(axis=0,how='any',subset=[col])
        # ...restricted to predictor columns with no missing values at all
        t2 = t1.dropna(axis=1,how='any')
        # downcasting (NOTE: errors='ignore' is deprecated in newer pandas)
        t3 = t2.apply(pd.to_numeric,errors='ignore',downcast='float',axis=0)
        # training dataset separation
        X_train = t3.drop(col,axis=1) #1 predictors
        Y_train = t3[col] #2 target = the column being imputed
        train_index = list(t3.index) # indices of training dataset
        train_col = list(X_train.columns) # columns of training dataset
        source_index = list(df.index) # source indices
        # required dataset: rows whose value for `col` must be predicted
        t4 = df.loc[[i for i in source_index if i not in train_index]]
        t5 = t4[[c for c in train_col]]
        X_req = t5.apply(pd.to_numeric,errors='ignore',downcast='float',axis=0) #3
        # categorical encoding — fresh encoder each iteration
        try:
            del dums
        except:
            pass
        dums=dummies()
        X_train = dums.fit_transform(X_train)
        X_req = dums.transform(X_req) # using same keys as in training dataset
        # standardize: fit the scaler on the training rows only
        scl = StandardScaler()
        X_train = pd.DataFrame(scl.fit_transform(X_train),columns=X_train.columns,index=X_train.index)
        X_req = pd.DataFrame(scl.transform(X_req),columns=X_req.columns,index=X_req.index)
        # interpolate for missing values of numeric type
        if re.search('obj',str(Y_train.dtype))==None:
            traces() # debug info
            # create interpolation model (regression)
            lr = LinearRegression()
            lr.fit(X_train,Y_train)
            # predict & impute back into the source frame by index
            Y_pred = pd.DataFrame(lr.predict(X_req),columns=[col],index=X_req.index)
            df.loc[X_req.index,[col]] = Y_pred
        # interpolate for missing values of categorical type
        elif re.search('obj',str(Y_train.dtype))!=None:
            traces() # debug info
            # Random over sampling technique to balance class frequencies
            osm = RandomOverSampler(sampling_strategy='minority')
            X_train, Y_train = osm.fit_resample(X_train,Y_train)
            # target label encoding: class value -> its position in li
            li=list(Y_train.unique())
            for i,key in enumerate(li):
                Y_train.loc[Y_train==key]=i
            # create interpolation model (classification)
            clf = XGBClassifier(objective='reg:logistic',eval_metric='mlogloss',use_label_encoder=False)
            clf.fit(X_train,Y_train)
            # predict
            Y_pred=clf.predict(X_req)
            # label decode back to the original class values
            Y_pred=[li[x] for x in Y_pred]
            Y_pred=pd.DataFrame(Y_pred,columns=[col],index=X_req.index)
            # impute
            df.loc[X_req.index,[col]] = Y_pred
    # outside loop of null listed features
    # downcasting
    df = df.apply(pd.to_numeric,errors='ignore',downcast='float',axis=0)
    return df
# delete any instance of global variable scoreLog left over from a previous
# notebook run, so reporter() starts from a fresh score table
try:
    del scoreLog
    print("scoreLog deleted")
except:  # NameError: scoreLog was never defined in this session
    print("scoreLog undefined")
# defining a function to report classification metrics
def reporter(Y_train, pred_train, Y_test, pred_test,model_name):
    """Classification report
    logs test scores to global dataframe named scoreLog
    the scoreLog (with any previous scores) will be displayed
    also displays confusion matrices of current instance of arguments
    ---------------------------------------------------------------------------
    Y_train ==> TRUE classes used for training (pandas series object or numpy array of 1-D)
    pred_train ==> PREDICTION on training data (pandas series object or numpy array of 1-D)
    Y_test ==> TRUE classes to be used for testing (pandas series object or numpy array of 1-D)
    pred_test ==> PREDICTION on test data (pandas series object or numpy array of 1-D)
    model_name ==> str name for current model, to be used as index for scoreLog
    ---------------------------------------------------------------------------
    """
    from sklearn import metrics
    import plotly.figure_factory as ff
    import numpy as np
    import pandas as pd
    global scoreLog
    classes=list(Y_test.unique())
    # score-table columns: accuracy plus per-class precision / recall / f-score
    cols=["accuracy"]
    cols.extend(["precision_"+str(c) for c in classes])
    cols.extend(["recall_"+str(c) for c in classes])
    cols.extend(["fscore_"+str(c) for c in classes])
    try:
        type(scoreLog)   # probe: does the global log already exist?
    except NameError:    # first call in this session -> start a fresh log
        scoreLog=pd.DataFrame(columns=cols)

    def _evaluate(y_true,y_pred,split_name,x_title):
        """Build the annotated confusion-matrix heatmap and the score row
        (accuracy, precision, recall, f1 per class) for one data split."""
        z=pd.DataFrame(metrics.confusion_matrix(y_true,y_pred))
        fig=ff.create_annotated_heatmap(np.array(z),annotation_text=np.array(z),
            x=list(np.sort(np.unique(y_true))),y=list(np.sort(np.unique(y_true))),
            colorscale='Mint',font_colors = ['grey','white'],name=split_name,
            hovertemplate="Prediction: %{x:d}<br>True: %{y:d}<br>Count: %{z:d}")
        fig.update_layout(height=350,width=350)
        fig.update_xaxes(title_text=x_title+model_name)
        fig.update_yaxes(title_text="TRUE",tickangle=270)
        score=[metrics.accuracy_score(y_true,y_pred)]
        score.extend(metrics.precision_score(y_true,y_pred,labels=classes,average=None))
        score.extend(metrics.recall_score(y_true,y_pred,labels=classes,average=None))
        score.extend(metrics.f1_score(y_true,y_pred,labels=classes,average=None))
        return fig,score

    # metrics based on training set
    fig1,score=_evaluate(Y_train,pred_train,"TRAINING SET","PREDICTED (TRAINING SET) - ")
    # fix: DataFrame.append was removed in pandas 2.0 -> use pd.concat
    scoreLog=pd.concat([scoreLog,pd.DataFrame(score,index=cols,columns=[model_name+"_training"]).T])
    # metrics based on test set
    fig2,score=_evaluate(Y_test,pred_test,"TEST SET","PREDICTED (TEST SET) - ")
    scoreLog=pd.concat([scoreLog,pd.DataFrame(score,index=cols,columns=[model_name+"_test"]).T])
    # merge both confusion matrix heatplots side by side
    # (make_subplots is a module global published by dependencies())
    fig=make_subplots(rows=1,cols=2,horizontal_spacing=0.05)
    fig.add_trace(fig1.data[0],row=1,col=1)
    fig.add_trace(fig2.data[0],row=1,col=2)
    annot1 = list(fig1.layout.annotations)
    annot2 = list(fig2.layout.annotations)
    for k in range(len(annot2)):
        # re-anchor the second heatmap's annotations onto the second subplot axes
        annot2[k]['xref'] = 'x2'
        annot2[k]['yref'] = 'y2'
    fig.update_layout(annotations=annot1+annot2)
    fig.layout.xaxis.update(fig1.layout.xaxis)
    fig.layout.yaxis.update(fig1.layout.yaxis)
    fig.layout.xaxis2.update(fig2.layout.xaxis)
    fig.layout.yaxis2.update(fig2.layout.yaxis)
    fig.layout.yaxis2.update({'title': {'text': ''}})  # drop duplicate y-axis title
    display(scoreLog)  # display(): IPython notebook builtin
    fig.show()
scoreLog undefined
DOMAIN: Automobile
CONTEXT: The data concerns city-cycle fuel consumption in miles per gallon to be predicted in terms of 3 multivalued discrete and 5 continuous attributes.
DATA DESCRIPTION:
| Variable | Type |
|---|---|
| cylinders | multi-valued discrete |
| acceleration | continuous |
| displacement | continuous |
| model year | multi-valued discrete |
| horsepower | continuous |
| origin | multi-valued discrete |
| weight | continuous |
| car name | string (unique for each instance) |
| mpg | continuous |
PROJECT OBJECTIVE: To understand K-means Clustering by applying on the Car Dataset to segment the cars into various categories.
STEPS AND TASK:
1. Data Understanding & Exploration:
A. Read ‘Car name.csv’ as a DataFrame and assign it to a variable.
B. Read ‘Car-Attributes.json’ as a DataFrame and assign it to a variable.
C. Merge both the DataFrames together to form a single DataFrame.
D. Print 5 point summary of the numerical features and share insights.
# read files
cname=pd.read_csv("Car name.csv")          # car names, one per row
cattr=pd.read_json("Car-Attributes.json")  # numeric / categorical attributes
display(cname.head())  # display(): IPython notebook builtin
display(cattr.head())
| car_name | |
|---|---|
| 0 | chevrolet chevelle malibu |
| 1 | buick skylark 320 |
| 2 | plymouth satellite |
| 3 | amc rebel sst |
| 4 | ford torino |
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
display(cname.shape,cattr.shape)  # expect matching row counts for an index join
(398, 1)
(398, 8)
both datasets could be related by index
#merge dataframes: 1:1 inner join on the shared RangeIndex
df=pd.merge(cname,cattr,left_index=True,right_index=True)
df.head() #confirm merger
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
df.shape # verify merger: expect 398 rows x 9 columns
(398, 9)
df.info() # review information: dtypes and non-null counts per column
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 car_name 398 non-null object 1 mpg 398 non-null float64 2 cyl 398 non-null int64 3 disp 398 non-null float64 4 hp 398 non-null object 5 wt 398 non-null int64 6 acc 398 non-null float64 7 yr 398 non-null int64 8 origin 398 non-null int64 dtypes: float64(3), int64(4), object(2) memory usage: 28.1+ KB
the dataset contains both numeric and object datatypes
# 5 point summary from describe command
# ([3:] drops the count/mean/std rows, leaving min/25%/50%/75%/max)
df.describe()[3:]
# by default the describe command considers only the numeric features
# alternately df.select_dtypes(exclude='object').describe()[3:] could be used
| mpg | cyl | disp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|
| min | 9.0 | 3.0 | 68.00 | 1613.00 | 8.000 | 70.0 | 1.0 |
| 25% | 17.5 | 4.0 | 104.25 | 2223.75 | 13.825 | 73.0 | 1.0 |
| 50% | 23.0 | 4.0 | 148.50 | 2803.50 | 15.500 | 76.0 | 1.0 |
| 75% | 29.0 | 8.0 | 262.00 | 3608.00 | 17.175 | 79.0 | 2.0 |
| max | 46.6 | 8.0 | 455.00 | 5140.00 | 24.800 | 82.0 | 3.0 |
the fuel efficiency ranges from 9 to 46.6, which is expected since displacement, weight & acceleration also vary widely.
also the scales are quite different, hence standardisation will be necessary.
note that hp column is missing in the above 5 point summary, probably due to some unexpected type casting or string values in the feature.
let us review the 5 point summary, after data cleaning once again for better understanding
2. Data Preparation & Analysis:
A. Check and print feature-wise percentage of missing values present in the data and impute with the best suitable approach.
B. Check for duplicate values in the data and impute with the best suitable approach.
H. Check for unexpected values in all the features and datapoints with such values.
display("nulsCount Result:",nulsCount(df))  # custom missing-value audit defined above
'nulsCount Result:'
| NULL | NULL % | NAN | NAN % | BLANKS | BLANKS % | UNEXP | UNEXP % | |
|---|---|---|---|---|---|---|---|---|
| hp | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 6 | 1.51 |
There are no NULLs, NANs and blank fields in the dataset
But there are 6 unexpected values in "hp" column
Since the percentage of unexpected values is just 1.51%, those records could very well be dropped.
But, for the sake of learning, lets review the hp column and impute the unexpected value accordingly
# filter unexpected value records (same sentinel list used by nulsCount)
df.loc[df["hp"].isin(["-","?",".","NA","N/A","Unknown"])]
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 32 | ford pinto | 25.0 | 4 | 98.0 | ? | 2046 | 19.0 | 71 | 1 |
| 126 | ford maverick | 21.0 | 6 | 200.0 | ? | 2875 | 17.0 | 74 | 1 |
| 330 | renault lecar deluxe | 40.9 | 4 | 85.0 | ? | 1835 | 17.3 | 80 | 2 |
| 336 | ford mustang cobra | 23.6 | 4 | 140.0 | ? | 2905 | 14.3 | 80 | 1 |
| 354 | renault 18i | 34.5 | 4 | 100.0 | ? | 2320 | 15.8 | 81 | 2 |
| 374 | amc concord dl | 23.0 | 4 | 151.0 | ? | 3035 | 20.5 | 82 | 1 |
there are "?" symbols marking unknown/missing values
Lets replace those with NaN
# lets create a benchmark 5 point summary before imputing
# (NOTE(review): errors='ignore' is deprecated in newer pandas)
df.loc[df['hp']!='?'].apply(pd.to_numeric,errors='ignore',downcast='float',axis=0).describe()[3:]
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| min | 9.000000 | 3.0 | 68.00 | 46.0 | 1613.00 | 8.000000 | 70.0 | 1.0 |
| 25% | 17.000000 | 4.0 | 105.00 | 75.0 | 2225.25 | 13.775000 | 73.0 | 1.0 |
| 50% | 22.750000 | 4.0 | 151.00 | 93.5 | 2803.50 | 15.500000 | 76.0 | 1.0 |
| 75% | 29.000000 | 8.0 | 275.75 | 126.0 | 3614.75 | 17.025000 | 79.0 | 2.0 |
| max | 46.599998 | 8.0 | 455.00 | 230.0 | 5140.00 | 24.799999 | 82.0 | 3.0 |
# replace unexpected values to NaN so they are treated as missing downstream
df["hp"].replace(to_replace=["-","?",".","NA","N/A","Unknown"],value=np.nan,inplace=True)
# lets visualise the NaNs in the dataframe (missingno matrix plot)
mno.matrix(df, figsize = (20, 6))
<AxesSubplot:>
# impute the nans; car_name is held aside because a unique-key column would
# mislead the ML-based imputation, then merged back on the index
df=pd.merge(df['car_name'],interpolate(df.drop('car_name',axis=1)),left_index=True,right_index=True)
# interpolate method is custom built function (code written at start of notebook)
# imputes missing values to seamlessly merge with the distribution
# does not use mean / mode as imputing value
# performs ML based prediction that best fits the overall distribution
# recheck for missing values (None means clean)
display("nulsCount Result:",nulsCount(df))
# visualise if any missing values present in dataframe
mno.matrix(df, figsize = (20, 6))
'nulsCount Result:'
None
<AxesSubplot:>
# lets review the 5 point summary again to confirm imputation did not shift it
df.describe()[3:]
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| min | 9.000000 | 3.0 | 68.00 | 46.0 | 1613.00 | 8.000000 | 70.0 | 1.0 |
| 25% | 17.500000 | 4.0 | 104.25 | 75.0 | 2223.75 | 13.825000 | 73.0 | 1.0 |
| 50% | 23.000000 | 4.0 | 148.50 | 92.5 | 2803.50 | 15.500000 | 76.0 | 1.0 |
| 75% | 29.000000 | 8.0 | 262.00 | 125.0 | 3608.00 | 17.175001 | 79.0 | 2.0 |
| max | 46.599998 | 8.0 | 455.00 | 230.0 | 5140.00 | 24.799999 | 82.0 | 3.0 |
the distribution has not shifted much after imputing
All unexpected values have been imputed successfully
lets check for duplicates
#check for duplicate rows (all 9 columns identical)
df.duplicated().sum()
0
No duplicate records
# lets recheck excluding car names (duplicates in attributes alone)
df.drop('car_name',axis=1).duplicated().sum()
0
No duplicate values in the attributes also
#lets check for duplicates in car_names (keep=False shows every occurrence)
df.loc[df['car_name'].duplicated(keep=False)].sort_values(by='car_name')
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 315 | amc concord | 24.299999 | 4.0 | 151.0 | 90.000000 | 3003.0 | 20.100000 | 80.0 | 1.0 |
| 257 | amc concord | 19.400000 | 6.0 | 232.0 | 90.000000 | 3210.0 | 17.200001 | 78.0 | 1.0 |
| 107 | amc gremlin | 18.000000 | 6.0 | 232.0 | 100.000000 | 2789.0 | 15.000000 | 73.0 | 1.0 |
| 169 | amc gremlin | 20.000000 | 6.0 | 232.0 | 100.000000 | 2914.0 | 16.000000 | 75.0 | 1.0 |
| 33 | amc gremlin | 19.000000 | 6.0 | 232.0 | 100.000000 | 2634.0 | 13.000000 | 71.0 | 1.0 |
| 24 | amc gremlin | 21.000000 | 6.0 | 199.0 | 90.000000 | 2648.0 | 15.000000 | 70.0 | 1.0 |
| 99 | amc hornet | 18.000000 | 6.0 | 232.0 | 100.000000 | 2945.0 | 16.000000 | 73.0 | 1.0 |
| 127 | amc hornet | 19.000000 | 6.0 | 232.0 | 100.000000 | 2901.0 | 16.000000 | 74.0 | 1.0 |
| 16 | amc hornet | 18.000000 | 6.0 | 199.0 | 97.000000 | 2774.0 | 15.500000 | 70.0 | 1.0 |
| 194 | amc hornet | 22.500000 | 6.0 | 232.0 | 90.000000 | 3085.0 | 17.600000 | 76.0 | 1.0 |
| 162 | amc matador | 15.000000 | 6.0 | 258.0 | 110.000000 | 3730.0 | 19.000000 | 75.0 | 1.0 |
| 134 | amc matador | 16.000000 | 6.0 | 258.0 | 110.000000 | 3632.0 | 18.000000 | 74.0 | 1.0 |
| 37 | amc matador | 18.000000 | 6.0 | 232.0 | 100.000000 | 3288.0 | 15.500000 | 71.0 | 1.0 |
| 189 | amc matador | 15.500000 | 8.0 | 304.0 | 120.000000 | 3962.0 | 13.900000 | 76.0 | 1.0 |
| 86 | amc matador | 14.000000 | 8.0 | 304.0 | 150.000000 | 3672.0 | 11.500000 | 73.0 | 1.0 |
| 140 | amc matador (sw) | 14.000000 | 8.0 | 304.0 | 150.000000 | 4257.0 | 15.500000 | 74.0 | 1.0 |
| 72 | amc matador (sw) | 15.000000 | 8.0 | 304.0 | 150.000000 | 3892.0 | 12.500000 | 72.0 | 1.0 |
| 177 | audi 100ls | 23.000000 | 4.0 | 115.0 | 95.000000 | 2694.0 | 15.000000 | 75.0 | 2.0 |
| 119 | audi 100ls | 20.000000 | 4.0 | 114.0 | 91.000000 | 2582.0 | 14.000000 | 73.0 | 2.0 |
| 160 | buick century | 17.000000 | 6.0 | 231.0 | 110.000000 | 3907.0 | 21.000000 | 75.0 | 1.0 |
| 363 | buick century | 22.400000 | 6.0 | 231.0 | 110.000000 | 3415.0 | 15.800000 | 81.0 | 1.0 |
| 13 | buick estate wagon (sw) | 14.000000 | 8.0 | 455.0 | 225.000000 | 3086.0 | 10.000000 | 70.0 | 1.0 |
| 289 | buick estate wagon (sw) | 16.900000 | 8.0 | 350.0 | 155.000000 | 4360.0 | 14.900000 | 79.0 | 1.0 |
| 226 | buick skylark | 20.500000 | 6.0 | 231.0 | 105.000000 | 3425.0 | 16.900000 | 77.0 | 1.0 |
| 339 | buick skylark | 26.600000 | 4.0 | 151.0 | 84.000000 | 2635.0 | 16.400000 | 81.0 | 1.0 |
| 221 | chevrolet caprice classic | 17.500000 | 8.0 | 305.0 | 145.000000 | 3880.0 | 12.500000 | 77.0 | 1.0 |
| 285 | chevrolet caprice classic | 17.000000 | 8.0 | 305.0 | 130.000000 | 3840.0 | 15.400000 | 79.0 | 1.0 |
| 91 | chevrolet caprice classic | 13.000000 | 8.0 | 400.0 | 150.000000 | 4464.0 | 12.000000 | 73.0 | 1.0 |
| 0 | chevrolet chevelle malibu | 18.000000 | 8.0 | 307.0 | 130.000000 | 3504.0 | 12.000000 | 70.0 | 1.0 |
| 35 | chevrolet chevelle malibu | 17.000000 | 6.0 | 250.0 | 100.000000 | 3329.0 | 15.500000 | 71.0 | 1.0 |
| 187 | chevrolet chevelle malibu classic | 17.500000 | 8.0 | 305.0 | 140.000000 | 4215.0 | 13.000000 | 76.0 | 1.0 |
| 133 | chevrolet chevelle malibu classic | 16.000000 | 6.0 | 250.0 | 100.000000 | 3781.0 | 17.000000 | 74.0 | 1.0 |
| 266 | chevrolet chevette | 30.000000 | 4.0 | 98.0 | 68.000000 | 2155.0 | 16.500000 | 78.0 | 1.0 |
| 195 | chevrolet chevette | 29.000000 | 4.0 | 85.0 | 52.000000 | 2035.0 | 22.200001 | 76.0 | 1.0 |
| 311 | chevrolet chevette | 32.099998 | 4.0 | 98.0 | 70.000000 | 2120.0 | 15.500000 | 80.0 | 1.0 |
| 237 | chevrolet chevette | 30.500000 | 4.0 | 98.0 | 63.000000 | 2051.0 | 17.000000 | 77.0 | 1.0 |
| 306 | chevrolet citation | 28.799999 | 6.0 | 173.0 | 115.000000 | 2595.0 | 11.300000 | 79.0 | 1.0 |
| 313 | chevrolet citation | 28.000000 | 4.0 | 151.0 | 90.000000 | 2678.0 | 16.500000 | 80.0 | 1.0 |
| 341 | chevrolet citation | 23.500000 | 6.0 | 173.0 | 110.000000 | 2725.0 | 12.600000 | 81.0 | 1.0 |
| 62 | chevrolet impala | 13.000000 | 8.0 | 350.0 | 165.000000 | 4274.0 | 12.000000 | 72.0 | 1.0 |
| 103 | chevrolet impala | 11.000000 | 8.0 | 400.0 | 150.000000 | 4997.0 | 14.000000 | 73.0 | 1.0 |
| 6 | chevrolet impala | 14.000000 | 8.0 | 454.0 | 220.000000 | 4354.0 | 9.000000 | 70.0 | 1.0 |
| 38 | chevrolet impala | 14.000000 | 8.0 | 350.0 | 165.000000 | 4209.0 | 12.000000 | 71.0 | 1.0 |
| 87 | chevrolet malibu | 13.000000 | 8.0 | 350.0 | 145.000000 | 3988.0 | 13.000000 | 73.0 | 1.0 |
| 253 | chevrolet malibu | 20.500000 | 6.0 | 200.0 | 95.000000 | 3155.0 | 18.200001 | 78.0 | 1.0 |
| 262 | chevrolet monte carlo landau | 19.200001 | 8.0 | 305.0 | 145.000000 | 3425.0 | 13.200000 | 78.0 | 1.0 |
| 230 | chevrolet monte carlo landau | 15.500000 | 8.0 | 350.0 | 170.000000 | 4165.0 | 11.400000 | 77.0 | 1.0 |
| 192 | chevrolet nova | 22.000000 | 6.0 | 250.0 | 105.000000 | 3353.0 | 14.500000 | 76.0 | 1.0 |
| 128 | chevrolet nova | 15.000000 | 6.0 | 250.0 | 100.000000 | 3336.0 | 17.000000 | 74.0 | 1.0 |
| 153 | chevrolet nova | 18.000000 | 6.0 | 250.0 | 105.000000 | 3459.0 | 16.000000 | 75.0 | 1.0 |
| 109 | chevrolet vega | 21.000000 | 4.0 | 140.0 | 72.000000 | 2401.0 | 19.500000 | 73.0 | 1.0 |
| 60 | chevrolet vega | 20.000000 | 4.0 | 140.0 | 90.000000 | 2408.0 | 19.500000 | 72.0 | 1.0 |
| 132 | chevrolet vega | 25.000000 | 4.0 | 140.0 | 75.000000 | 2542.0 | 17.000000 | 74.0 | 1.0 |
| 324 | datsun 210 | 40.799999 | 4.0 | 85.0 | 65.000000 | 2110.0 | 19.200001 | 80.0 | 3.0 |
| 303 | datsun 210 | 31.799999 | 4.0 | 85.0 | 65.000000 | 2020.0 | 19.200001 | 79.0 | 3.0 |
| 173 | datsun 710 | 24.000000 | 4.0 | 119.0 | 97.000000 | 2545.0 | 17.000000 | 75.0 | 3.0 |
| 145 | datsun 710 | 32.000000 | 4.0 | 83.0 | 61.000000 | 2003.0 | 19.000000 | 74.0 | 3.0 |
| 18 | datsun pl510 | 27.000000 | 4.0 | 97.0 | 88.000000 | 2130.0 | 14.500000 | 70.0 | 3.0 |
| 29 | datsun pl510 | 27.000000 | 4.0 | 97.0 | 88.000000 | 2130.0 | 14.500000 | 71.0 | 3.0 |
| 316 | dodge aspen | 19.100000 | 6.0 | 225.0 | 90.000000 | 3381.0 | 18.700001 | 80.0 | 1.0 |
| 260 | dodge aspen | 18.600000 | 6.0 | 225.0 | 110.000000 | 3620.0 | 18.700001 | 78.0 | 1.0 |
| 323 | dodge colt | 27.900000 | 4.0 | 156.0 | 105.000000 | 2800.0 | 14.400000 | 80.0 | 1.0 |
| 146 | dodge colt | 28.000000 | 4.0 | 90.0 | 75.000000 | 2125.0 | 14.500000 | 74.0 | 1.0 |
| 185 | dodge colt | 26.000000 | 4.0 | 98.0 | 79.000000 | 2255.0 | 17.700001 | 76.0 | 1.0 |
| 117 | fiat 128 | 29.000000 | 4.0 | 68.0 | 49.000000 | 1867.0 | 19.500000 | 73.0 | 2.0 |
| 147 | fiat 128 | 24.000000 | 4.0 | 90.0 | 75.000000 | 2108.0 | 15.500000 | 74.0 | 2.0 |
| 290 | ford country squire (sw) | 15.500000 | 8.0 | 351.0 | 142.000000 | 4054.0 | 14.300000 | 79.0 | 1.0 |
| 43 | ford country squire (sw) | 13.000000 | 8.0 | 400.0 | 170.000000 | 4746.0 | 12.000000 | 71.0 | 1.0 |
| 65 | ford galaxie 500 | 14.000000 | 8.0 | 351.0 | 153.000000 | 4129.0 | 13.000000 | 72.0 | 1.0 |
| 40 | ford galaxie 500 | 14.000000 | 8.0 | 351.0 | 153.000000 | 4154.0 | 13.500000 | 71.0 | 1.0 |
| 5 | ford galaxie 500 | 15.000000 | 8.0 | 429.0 | 198.000000 | 4341.0 | 10.000000 | 70.0 | 1.0 |
| 88 | ford gran torino | 14.000000 | 8.0 | 302.0 | 137.000000 | 4042.0 | 14.500000 | 73.0 | 1.0 |
| 136 | ford gran torino | 16.000000 | 8.0 | 302.0 | 140.000000 | 4141.0 | 14.000000 | 74.0 | 1.0 |
| 190 | ford gran torino | 14.500000 | 8.0 | 351.0 | 152.000000 | 4215.0 | 12.800000 | 76.0 | 1.0 |
| 74 | ford gran torino (sw) | 13.000000 | 8.0 | 302.0 | 140.000000 | 4294.0 | 16.000000 | 72.0 | 1.0 |
| 139 | ford gran torino (sw) | 14.000000 | 8.0 | 302.0 | 140.000000 | 4638.0 | 16.000000 | 74.0 | 1.0 |
| 92 | ford ltd | 13.000000 | 8.0 | 351.0 | 158.000000 | 4363.0 | 13.000000 | 73.0 | 1.0 |
| 159 | ford ltd | 14.000000 | 8.0 | 351.0 | 148.000000 | 4657.0 | 13.500000 | 75.0 | 1.0 |
| 155 | ford maverick | 15.000000 | 6.0 | 250.0 | 72.000000 | 3158.0 | 19.500000 | 75.0 | 1.0 |
| 126 | ford maverick | 21.000000 | 6.0 | 200.0 | 94.301857 | 2875.0 | 17.000000 | 74.0 | 1.0 |
| 193 | ford maverick | 24.000000 | 6.0 | 200.0 | 81.000000 | 3012.0 | 17.600000 | 76.0 | 1.0 |
| 17 | ford maverick | 21.000000 | 6.0 | 200.0 | 85.000000 | 2587.0 | 16.000000 | 70.0 | 1.0 |
| 100 | ford maverick | 18.000000 | 6.0 | 250.0 | 88.000000 | 3021.0 | 16.500000 | 73.0 | 1.0 |
| 130 | ford pinto | 26.000000 | 4.0 | 122.0 | 80.000000 | 2451.0 | 16.500000 | 74.0 | 1.0 |
| 174 | ford pinto | 18.000000 | 6.0 | 171.0 | 97.000000 | 2984.0 | 14.500000 | 75.0 | 1.0 |
| 112 | ford pinto | 19.000000 | 4.0 | 122.0 | 85.000000 | 2310.0 | 18.500000 | 73.0 | 1.0 |
| 168 | ford pinto | 23.000000 | 4.0 | 140.0 | 83.000000 | 2639.0 | 17.000000 | 75.0 | 1.0 |
| 32 | ford pinto | 25.000000 | 4.0 | 98.0 | 59.251991 | 2046.0 | 19.000000 | 71.0 | 1.0 |
| 206 | ford pinto | 26.500000 | 4.0 | 140.0 | 72.000000 | 2565.0 | 13.600000 | 76.0 | 1.0 |
| 337 | honda accord | 32.400002 | 4.0 | 107.0 | 72.000000 | 2290.0 | 17.000000 | 80.0 | 3.0 |
| 381 | honda accord | 36.000000 | 4.0 | 107.0 | 75.000000 | 2205.0 | 14.500000 | 82.0 | 3.0 |
| 198 | honda civic | 33.000000 | 4.0 | 91.0 | 53.000000 | 1795.0 | 17.400000 | 76.0 | 3.0 |
| 383 | honda civic | 38.000000 | 4.0 | 91.0 | 67.000000 | 1965.0 | 15.000000 | 82.0 | 3.0 |
| 149 | honda civic | 24.000000 | 4.0 | 120.0 | 97.000000 | 2489.0 | 15.000000 | 74.0 | 3.0 |
| 181 | honda civic cvcc | 33.000000 | 4.0 | 91.0 | 53.000000 | 1795.0 | 17.500000 | 75.0 | 3.0 |
| 248 | honda civic cvcc | 36.099998 | 4.0 | 91.0 | 60.000000 | 1800.0 | 16.400000 | 78.0 | 3.0 |
| 358 | mazda 626 | 31.600000 | 4.0 | 120.0 | 74.000000 | 2635.0 | 18.299999 | 81.0 | 3.0 |
| 319 | mazda 626 | 31.299999 | 4.0 | 120.0 | 75.000000 | 2542.0 | 17.500000 | 80.0 | 3.0 |
| 300 | oldsmobile cutlass salon brougham | 23.900000 | 8.0 | 260.0 | 90.000000 | 3420.0 | 22.200001 | 79.0 | 1.0 |
| 249 | oldsmobile cutlass salon brougham | 19.900000 | 8.0 | 260.0 | 110.000000 | 3365.0 | 15.500000 | 78.0 | 1.0 |
| 183 | opel 1900 | 25.000000 | 4.0 | 116.0 | 81.000000 | 2220.0 | 16.900000 | 76.0 | 2.0 |
| 50 | opel 1900 | 28.000000 | 4.0 | 116.0 | 90.000000 | 2123.0 | 14.000000 | 71.0 | 2.0 |
| 143 | opel manta | 26.000000 | 4.0 | 97.0 | 78.000000 | 2300.0 | 14.500000 | 74.0 | 2.0 |
| 118 | opel manta | 24.000000 | 4.0 | 116.0 | 75.000000 | 2158.0 | 15.500000 | 73.0 | 2.0 |
| 20 | peugeot 504 | 25.000000 | 4.0 | 110.0 | 87.000000 | 2672.0 | 17.500000 | 70.0 | 2.0 |
| 178 | peugeot 504 | 23.000000 | 4.0 | 120.0 | 88.000000 | 2957.0 | 17.000000 | 75.0 | 2.0 |
| 299 | peugeot 504 | 27.200001 | 4.0 | 141.0 | 71.000000 | 3190.0 | 24.799999 | 79.0 | 2.0 |
| 209 | peugeot 504 | 19.000000 | 4.0 | 120.0 | 88.000000 | 3270.0 | 21.900000 | 76.0 | 2.0 |
| 125 | plymouth duster | 20.000000 | 6.0 | 198.0 | 95.000000 | 3102.0 | 16.500000 | 74.0 | 1.0 |
| 15 | plymouth duster | 22.000000 | 6.0 | 198.0 | 95.000000 | 2833.0 | 15.500000 | 70.0 | 1.0 |
| 101 | plymouth duster | 23.000000 | 6.0 | 198.0 | 95.000000 | 2904.0 | 16.000000 | 73.0 | 1.0 |
| 7 | plymouth fury iii | 14.000000 | 8.0 | 440.0 | 215.000000 | 4312.0 | 8.500000 | 70.0 | 1.0 |
| 41 | plymouth fury iii | 14.000000 | 8.0 | 318.0 | 150.000000 | 4096.0 | 13.000000 | 71.0 | 1.0 |
| 64 | plymouth fury iii | 15.000000 | 8.0 | 318.0 | 150.000000 | 4135.0 | 13.500000 | 72.0 | 1.0 |
| 338 | plymouth reliant | 27.200001 | 4.0 | 135.0 | 84.000000 | 2490.0 | 15.700000 | 81.0 | 1.0 |
| 342 | plymouth reliant | 30.000000 | 4.0 | 135.0 | 84.000000 | 2385.0 | 12.900000 | 81.0 | 1.0 |
| 97 | plymouth valiant | 18.000000 | 6.0 | 225.0 | 105.000000 | 3121.0 | 16.500000 | 73.0 | 1.0 |
| 191 | plymouth valiant | 22.000000 | 6.0 | 225.0 | 100.000000 | 3233.0 | 15.400000 | 76.0 | 1.0 |
| 63 | pontiac catalina | 14.000000 | 8.0 | 400.0 | 175.000000 | 4385.0 | 12.000000 | 72.0 | 1.0 |
| 8 | pontiac catalina | 14.000000 | 8.0 | 455.0 | 225.000000 | 4425.0 | 10.000000 | 70.0 | 1.0 |
| 156 | pontiac catalina | 16.000000 | 8.0 | 400.0 | 170.000000 | 4668.0 | 11.500000 | 75.0 | 1.0 |
| 372 | pontiac phoenix | 27.000000 | 4.0 | 151.0 | 90.000000 | 2735.0 | 18.000000 | 82.0 | 1.0 |
| 308 | pontiac phoenix | 33.500000 | 4.0 | 151.0 | 90.000000 | 2556.0 | 13.200000 | 79.0 | 1.0 |
| 122 | saab 99le | 24.000000 | 4.0 | 121.0 | 110.000000 | 2660.0 | 14.000000 | 73.0 | 2.0 |
| 180 | saab 99le | 25.000000 | 4.0 | 121.0 | 115.000000 | 2671.0 | 13.500000 | 75.0 | 2.0 |
| 150 | subaru | 26.000000 | 4.0 | 108.0 | 93.000000 | 2391.0 | 15.500000 | 74.0 | 3.0 |
| 346 | subaru | 32.299999 | 4.0 | 97.0 | 67.000000 | 2065.0 | 17.799999 | 81.0 | 3.0 |
| 331 | subaru dl | 33.799999 | 4.0 | 97.0 | 67.000000 | 2145.0 | 18.000000 | 80.0 | 3.0 |
| 239 | subaru dl | 30.000000 | 4.0 | 97.0 | 67.000000 | 1985.0 | 16.400000 | 77.0 | 3.0 |
| 205 | toyota corolla | 28.000000 | 4.0 | 97.0 | 75.000000 | 2155.0 | 16.400000 | 76.0 | 3.0 |
| 167 | toyota corolla | 29.000000 | 4.0 | 97.0 | 75.000000 | 2171.0 | 16.000000 | 75.0 | 3.0 |
| 321 | toyota corolla | 32.200001 | 4.0 | 108.0 | 75.000000 | 2265.0 | 15.200000 | 80.0 | 3.0 |
| 382 | toyota corolla | 34.000000 | 4.0 | 108.0 | 70.000000 | 2245.0 | 16.900000 | 82.0 | 3.0 |
| 356 | toyota corolla | 32.400002 | 4.0 | 108.0 | 75.000000 | 2350.0 | 16.799999 | 81.0 | 3.0 |
| 53 | toyota corolla 1200 | 31.000000 | 4.0 | 71.0 | 65.000000 | 1773.0 | 19.000000 | 71.0 | 3.0 |
| 131 | toyota corolla 1200 | 32.000000 | 4.0 | 71.0 | 65.000000 | 1836.0 | 21.000000 | 74.0 | 3.0 |
| 31 | toyota corona | 25.000000 | 4.0 | 113.0 | 95.000000 | 2228.0 | 14.000000 | 71.0 | 3.0 |
| 171 | toyota corona | 24.000000 | 4.0 | 134.0 | 96.000000 | 2702.0 | 13.500000 | 75.0 | 3.0 |
| 267 | toyota corona | 27.500000 | 4.0 | 134.0 | 95.000000 | 2560.0 | 14.200000 | 78.0 | 3.0 |
| 144 | toyota corona | 31.000000 | 4.0 | 76.0 | 52.000000 | 1649.0 | 16.500000 | 74.0 | 3.0 |
| 123 | toyota mark ii | 20.000000 | 6.0 | 156.0 | 122.000000 | 2807.0 | 13.500000 | 73.0 | 3.0 |
| 210 | toyota mark ii | 19.000000 | 6.0 | 156.0 | 108.000000 | 2930.0 | 15.500000 | 76.0 | 3.0 |
| 172 | volkswagen dasher | 25.000000 | 4.0 | 90.0 | 71.000000 | 2223.0 | 16.500000 | 75.0 | 2.0 |
| 142 | volkswagen dasher | 26.000000 | 4.0 | 79.0 | 67.000000 | 1963.0 | 15.500000 | 74.0 | 2.0 |
| 240 | volkswagen dasher | 30.500000 | 4.0 | 97.0 | 78.000000 | 2190.0 | 14.100000 | 77.0 | 2.0 |
| 203 | volkswagen rabbit | 29.500000 | 4.0 | 97.0 | 71.000000 | 1825.0 | 12.200000 | 76.0 | 2.0 |
| 175 | volkswagen rabbit | 29.000000 | 4.0 | 90.0 | 70.000000 | 1937.0 | 14.000000 | 75.0 | 2.0 |
| 197 | vw rabbit | 29.000000 | 4.0 | 90.0 | 70.000000 | 1937.0 | 14.200000 | 76.0 | 2.0 |
| 309 | vw rabbit | 41.500000 | 4.0 | 98.0 | 76.000000 | 2144.0 | 14.700000 | 80.0 | 2.0 |
Yes, there are repeated car names.
Let's try to append the model year to the car name.
# append car_name + 19th century + year + origin
# Vectorised string build: one pandas expression instead of a per-row
# .loc[...][0] comprehension (same result, label-safe, O(n) in C).
df['car_name_2'] = '19' + df['yr'].astype(int).astype(str) + ' ' + df['car_name']
df.head() # verify addition
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | car_name_2 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8.0 | 307.0 | 130.0 | 3504.0 | 12.0 | 70.0 | 1.0 | 1970 chevrolet chevelle malibu |
| 1 | buick skylark 320 | 15.0 | 8.0 | 350.0 | 165.0 | 3693.0 | 11.5 | 70.0 | 1.0 | 1970 buick skylark 320 |
| 2 | plymouth satellite | 18.0 | 8.0 | 318.0 | 150.0 | 3436.0 | 11.0 | 70.0 | 1.0 | 1970 plymouth satellite |
| 3 | amc rebel sst | 16.0 | 8.0 | 304.0 | 150.0 | 3433.0 | 12.0 | 70.0 | 1.0 | 1970 amc rebel sst |
| 4 | ford torino | 17.0 | 8.0 | 302.0 | 140.0 | 3449.0 | 10.5 | 70.0 | 1.0 | 1970 ford torino |
Custom name created.
Now let's recheck for duplicates in car_name_2.
df.loc[df['car_name_2'].duplicated(keep=False)]
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | car_name_2 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 168 | ford pinto | 23.000000 | 4.0 | 140.0 | 83.0 | 2639.0 | 17.0 | 75.0 | 1.0 | 1975 ford pinto |
| 174 | ford pinto | 18.000000 | 6.0 | 171.0 | 97.0 | 2984.0 | 14.5 | 75.0 | 1.0 | 1975 ford pinto |
| 338 | plymouth reliant | 27.200001 | 4.0 | 135.0 | 84.0 | 2490.0 | 15.7 | 81.0 | 1.0 | 1981 plymouth reliant |
| 342 | plymouth reliant | 30.000000 | 4.0 | 135.0 | 84.0 | 2385.0 | 12.9 | 81.0 | 1.0 | 1981 plymouth reliant |
Duplicates are still present.
Let's try to add information about low kerb weight.
# find mean weights by car_name_2
mnwt=df[['wt','car_name_2']].groupby(by=['car_name_2']).mean()
# compare 'wt' with the mean weight of the row's own car_name_2 group.
# BUG FIX: the original iterated range(df.shape[0]) (positional) while the
# assigned column aligns by label — that silently misaligns rows whenever the
# index is not a contiguous RangeIndex. Series.map is label-safe and vectorised.
df['chk'] = df['wt'] < df['car_name_2'].map(mnwt['wt'])
# annotate lower kerb weight found in previous step as "XtraMile" variant
df['car_name_3'] = df['car_name_2'] + np.where(df['chk'], ' XtraMile', '')
# verify variant addition in car_name_3 for the duplicates of car_name_2
df.loc[df['car_name_2'].duplicated(keep=False)]
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | car_name_2 | chk | car_name_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 168 | ford pinto | 23.000000 | 4.0 | 140.0 | 83.0 | 2639.0 | 17.0 | 75.0 | 1.0 | 1975 ford pinto | True | 1975 ford pinto XtraMile |
| 174 | ford pinto | 18.000000 | 6.0 | 171.0 | 97.0 | 2984.0 | 14.5 | 75.0 | 1.0 | 1975 ford pinto | False | 1975 ford pinto |
| 338 | plymouth reliant | 27.200001 | 4.0 | 135.0 | 84.0 | 2490.0 | 15.7 | 81.0 | 1.0 | 1981 plymouth reliant | False | 1981 plymouth reliant |
| 342 | plymouth reliant | 30.000000 | 4.0 | 135.0 | 84.0 | 2385.0 | 12.9 | 81.0 | 1.0 | 1981 plymouth reliant | True | 1981 plymouth reliant XtraMile |
Successfully named.
If all the additional columns are dropped, there will be no duplicates.
# Drop the helper columns, move car_name_3 to the front, and rename it back
# to car_name — net effect: the composite name replaces the original one.
df = df.drop(['car_name', 'car_name_2', 'chk'], axis=1)
df = df[['car_name_3', 'mpg', 'cyl', 'disp', 'hp', 'wt', 'acc', 'yr', 'origin']]
df = df.rename(columns={'car_name_3': 'car_name'})
Let's check for duplicate car_name values.
df.loc[df['car_name'].duplicated(keep=False)].sort_values(by='car_name')
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin |
|---|
Hence all duplicates were resolved successfully.
2. Data Preparation & Analysis:
C. Plot a pairplot for all features.
# numeric feature names (car_name excluded); reused later as subplot titles
dims=df.select_dtypes(exclude='object').columns
fig = px.scatter_matrix(df,dimensions=dims,height=800,opacity=0.5)
fig.show()
the following inferences could be made from the above distribution
# the above pairplot misses a diagonal histogram / kde plot
# hence let's visualise the kde plot below
fig = make_subplots(rows=2,cols=4,subplot_titles=dims)
row=1
col=1
# one KDE-only distplot per numeric feature, copied into the 2x4 grid
for i in dims:
fig2 = ff.create_distplot([list(df[i])],[i],curve_type='kde',show_hist=False,show_rug=False)
fig.add_trace(go.Scatter(fig2.data[0]),row,col)
col+=1
# wrap to the second row after the fourth column
if col==5:
col=1
row=2
fig.update_layout(height=500,width=1000,showlegend=False)
fig.show()
the following inferences could be made from the above distribution
2. Data Preparation & Analysis:
D. Visualize a scatterplot for ‘wt’ and ‘disp’. Datapoints should be distinguishable by ‘cyl’.
E. Share insights for Q2.d.
F. Visualize a scatterplot for ‘wt’ and ’mpg’. Datapoints should be distinguishable by ‘cyl’.
G. Share insights for Q2.f.
# to ensure discrete colouring (plotly colours object dtype categorically,
# numeric dtype continuously)
df['cyl']=df['cyl'].astype(object)
df['origin']=df['origin'].astype(object)
px.scatter(df, x="wt", y="disp",color="cyl",symbol='origin',color_discrete_sequence=px.colors.qualitative.D3)
Inferences:
as weight of the car increases, engine displacement need to be increased to handle the higher loads
higher engine displacement is achieved using more number of cylinders
only American cars (origin 1) has 8 cylinders
only Japanese cars (origin 3) has 3 cylinders
only German cars (origin 2) has 5 cylinders
px.scatter(df, x="wt", y="mpg", color="cyl",symbol='origin',color_discrete_sequence=px.colors.qualitative.D3)
Inferences:
Increased weight drastically impacts mileage, more the weight less the mileage
more number of cylinders could mean lower mileage
yet, 4 cylinder cars have exhibited better mileage than 3 cylinder cars of comparable weights
3. Clustering:
A. Apply K-Means clustering for 2 to 10 clusters.
B. Plot a visual and find elbow point.
C. On the above visual, highlight which are the possible Elbow points.
# downcast numeric attributes to float
df = df.apply(pd.to_numeric, errors='ignore', downcast='float', axis=0)
# dropping uniqueKey car_name
mdata = df.drop('car_name', axis=1)
# keep aside 3 samples for later testing
test = mdata.sample(3, random_state=129)
# separate the training data: drop the test labels directly instead of the
# original O(n^2) list-membership comprehension (same rows, same order)
train = mdata.drop(test.index)
# standardize dataset
scl = StandardScaler()
train_scl = pd.DataFrame(scl.fit_transform(train), columns=train.columns, index=train.index)
# NOTE(review): the brief says "2 to 10 clusters" but range(2,10) stops at 9 —
# confirm intent before widening; the recorded outputs below assume 2..9.
clusters = range(2, 10)
elbow = pd.DataFrame(columns=['n_clusters', 'distortion', 'slope', 'slope_delta'])
best = pd.DataFrame(columns=['rank', 'n_clusters', 'distortion', 'slope_delta'], index=[1, 2])
meanDistortions = []
# run clustering and measure distortions
for k in clusters:
    # NOTE(review): no random_state on KMeans, so exact labels/distortions can
    # vary slightly between runs — confirm whether reproducibility is required
    model = KMeans(n_clusters=k)
    model.fit(train_scl)
    # distortion = mean euclidean distance of each point to its nearest centroid
    # (the unused per-point prediction from the original has been removed)
    meanDistortions.append(sum(np.min(cdist(train_scl, model.cluster_centers_, 'euclidean'), axis=1)) / train_scl.shape[0])
# analyse change in distortions
slope=[]
# slope of graph: first differences of the distortion curve (last entry NaN
# because the final cluster count has no successor)
slope.extend([meanDistortions[i+1]-meanDistortions[i] for i in range(len(meanDistortions)-1)])
slope.append(np.nan)
slope_delta=[np.nan]
# change of slope of graph: second differences; NaN padding keeps the list
# the same length as `clusters`
slope_delta.extend([slope[i]-slope[i-1] for i in range(1,len(slope)-1)])
slope_delta.append(np.nan)
elbow.n_clusters=clusters
elbow.distortion=meanDistortions
elbow.slope=slope
elbow.slope_delta=slope_delta
display(elbow)
# rank number of cluster based on change of slope
elbow=elbow.sort_values(by=['slope_delta','slope'],ascending=False)
# take the top-2 rows; ind is the original positional index, so ind+2 maps
# back to the cluster count (clusters started at 2)
for i in range(1,3):
set1=[]
ind=elbow.index[i-1]
set1.append(i)
set1.append(ind+2)
set1.append(elbow['distortion'].loc[ind])
set1.append(elbow['slope_delta'].loc[ind])
best.loc[i]=set1
print()
display("top 2 options for optimal cluster numbers are",best[['rank','n_clusters']])
| n_clusters | distortion | slope | slope_delta | |
|---|---|---|---|---|
| 0 | 2 | 1.920009 | -0.281050 | NaN |
| 1 | 3 | 1.638959 | -0.154255 | 0.126795 |
| 2 | 4 | 1.484704 | -0.143899 | 0.010356 |
| 3 | 5 | 1.340805 | -0.059291 | 0.084608 |
| 4 | 6 | 1.281515 | -0.062905 | -0.003614 |
| 5 | 7 | 1.218610 | -0.031197 | 0.031707 |
| 6 | 8 | 1.187412 | -0.048009 | -0.016812 |
| 7 | 9 | 1.139403 | NaN | NaN |
'top 2 options for optimal cluster numbers are'
| rank | n_clusters | |
|---|---|---|
| 1 | 1 | 3 |
| 2 | 2 | 5 |
# visualise the elbow curve with the two candidate optima highlighted
gdata=elbow.sort_index().copy()
fig=go.Figure()
# plot optimal cluster numbers
fig.add_trace(go.Scatter(x=best.n_clusters, y=best.distortion,
                         mode='markers',name='optimal clusters',
                         marker={'size':15,'color':'#FFA15A'},
                         text=best['rank'],
                         # FIX: original template left <b> unclosed and had a stray ')'
                         hovertemplate='<b>OPTIMA %{text}</b>: %{x} clusters'))
# plot the distortions for cluster range of 2-9
fig.add_trace(go.Scatter(x=gdata.n_clusters, y=gdata.distortion,name='Distortions',
                         marker={'color':'#1F77B4'},
                         hovertemplate='<b>Distortions</b><br>'+
                                       'n_clusters: %{x}<br>'+
                                       'distortion: %{y:.2f}'))
fig.update_xaxes(title_text="n_clusters")
fig.update_yaxes(title_text="distortions")
fig.update_layout(title="Selecting k with the Elbow Method")
fig.show()
from the above graph it is evident that a cluster number of 3 is best
to enhance granularity, lets choose the second best : 5 clusters
3. Clustering:
D. Train a K-means clustering model once again on the optimal number of clusters.
E. Add a new feature in the DataFrame which will have labels based upon cluster value.
F. Plot a visual and color the datapoints based upon clusters.
# 5 clusters: retrain on the chosen k (second-best elbow option)
model=KMeans(n_clusters=5)
model.fit(train_scl)
# object dtype so plotly treats the label as categorical downstream
prediction=pd.DataFrame(model.predict(train_scl),columns=["CLUSTER"],index=train_scl.index,dtype='object')
# associate the cluster labels to the dataset in a single dataframe
clust=pd.merge(df['car_name'],train,left_index=True,right_index=True,how='inner')
clust=pd.merge(clust,prediction,left_index=True,right_index=True,how='inner')
# to ensure consistent coloring let's apply color codes manually
cl=list(clust.CLUSTER.unique())
colmap=['#636EFA','#EF553B','#00CC96','#AB63FA','#FFA15A']
clust["cl_col"]=''
for i,clt in enumerate(cl):
clust.loc[clust['CLUSTER']==clt,["cl_col"]]=colmap[i]
# let's display 1 datapoint from each cluster
cl_ind=[clust.loc[clust['CLUSTER']==i].index[0] for i in cl]
display(clust.loc[cl_ind])
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | CLUSTER | cl_col | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1970 chevrolet chevelle malibu | 18.0 | 8.0 | 307.0 | 130.0 | 3504.0 | 12.0 | 70.0 | 1.0 | 4 | #636EFA |
| 14 | 1970 toyota corona mark ii | 24.0 | 4.0 | 113.0 | 95.0 | 2372.0 | 15.0 | 70.0 | 3.0 | 3 | #EF553B |
| 15 | 1970 plymouth duster | 22.0 | 6.0 | 198.0 | 95.0 | 2833.0 | 15.5 | 70.0 | 1.0 | 1 | #00CC96 |
| 129 | 1974 datsun b210 | 31.0 | 4.0 | 79.0 | 67.0 | 1950.0 | 19.0 | 74.0 | 3.0 | 2 | #AB63FA |
| 168 | 1975 ford pinto XtraMile | 23.0 | 4.0 | 140.0 | 83.0 | 2639.0 | 17.0 | 75.0 | 1.0 | 0 | #FFA15A |
# let's review random datapoints with their cluster labels and colours
clust.sample(10)
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | CLUSTER | cl_col | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 350 | 1981 plymouth horizon 4 | 34.700001 | 4.0 | 105.0 | 63.0 | 2215.0 | 14.900000 | 81.0 | 1.0 | 0 | #FFA15A |
| 198 | 1976 honda civic | 33.000000 | 4.0 | 91.0 | 53.0 | 1795.0 | 17.400000 | 76.0 | 3.0 | 2 | #AB63FA |
| 204 | 1976 datsun b-210 | 32.000000 | 4.0 | 85.0 | 70.0 | 1990.0 | 17.000000 | 76.0 | 3.0 | 2 | #AB63FA |
| 295 | 1979 dodge colt hatchback custom | 35.700001 | 4.0 | 98.0 | 80.0 | 1915.0 | 14.400000 | 79.0 | 1.0 | 0 | #FFA15A |
| 33 | 1971 amc gremlin | 19.000000 | 6.0 | 232.0 | 100.0 | 2634.0 | 13.000000 | 71.0 | 1.0 | 1 | #00CC96 |
| 389 | 1982 ford granada l | 22.000000 | 6.0 | 232.0 | 112.0 | 2835.0 | 14.700000 | 82.0 | 1.0 | 1 | #00CC96 |
| 257 | 1978 amc concord | 19.400000 | 6.0 | 232.0 | 90.0 | 3210.0 | 17.200001 | 78.0 | 1.0 | 1 | #00CC96 |
| 19 | 1970 volkswagen 1131 deluxe sedan | 26.000000 | 4.0 | 97.0 | 46.0 | 1835.0 | 20.500000 | 70.0 | 2.0 | 3 | #EF553B |
| 309 | 1980 vw rabbit | 41.500000 | 4.0 | 98.0 | 76.0 | 2144.0 | 14.700000 | 80.0 | 2.0 | 2 | #AB63FA |
| 200 | 1976 ford granada ghia | 18.000000 | 6.0 | 250.0 | 78.0 | 3574.0 | 21.000000 | 76.0 | 1.0 | 1 | #00CC96 |
# visualize cluster labels across every numeric feature pair
cols=clust.select_dtypes(exclude='object').columns
px.scatter_matrix(clust,dimensions=cols,color='CLUSTER',height=800,opacity=0.5)
The above plot gives a picture of the clustering results.
Yet, to study the cluster reasoning, let's make custom charts as below.
# 3D view 1: weight vs displacement vs mileage, coloured by cluster
fig = go.Figure()
fig.add_trace(go.Scatter3d(x=clust['wt'],y=clust['disp'],z=clust['mpg'], mode='markers', name='',
marker=dict(color=clust['cl_col'],opacity=0.65),text=clust["CLUSTER"],
hovertemplate='<b>CLUSTER %{text}</b><br>'+
'weight:%{x}<br>'+
'displacement:%{y}<br>'+
'mileage:%{z}'))
fig.update_layout(height=1000,title="3D Scatter",scene=dict(xaxis_title='Weight',yaxis_title='Displacement',zaxis_title='Mileage'))
# 3D view 2: origin vs year vs mileage
fig = go.Figure()
fig.add_trace(go.Scatter3d(x=clust['origin'],y=clust['yr'],z=clust['mpg'], mode='markers', name='',
marker=dict(color=clust['cl_col'],opacity=0.65),text=clust["CLUSTER"],
hovertemplate='<b>CLUSTER %{text}</b><br>'+
'origin:%{x}<br>'+
'year:%{y}<br>'+
'mileage:%{z}'))
fig.update_layout(height=1000,title="3D Scatter",scene=dict(xaxis_title='Origin',yaxis_title='Year',zaxis_title='Mileage'))
# 3D view 3: weight vs year vs mileage
fig = go.Figure()
fig.add_trace(go.Scatter3d(x=clust['wt'],y=clust['yr'],z=clust['mpg'], mode='markers', name='',
marker=dict(color=clust['cl_col'],opacity=0.65),text=clust["CLUSTER"],
hovertemplate='<b>CLUSTER %{text}</b><br>'+
'weight:%{x}<br>'+
'year:%{y}<br>'+
'mileage:%{z}'))
fig.update_layout(height=1000,title="3D Scatter",scene=dict(xaxis_title='Weight',yaxis_title='Year',zaxis_title='Mileage'))
Note: we refer to clusters by colour, as cluster numbers could change if retrained; the clusters themselves remain the same unless the data changes, and the colours remain the same unless the colourmap is changed.
It could be seen from above visualisations that PURPLE CLUSTER represents the most efficient cars, built to be light weight and that are creations of technological advancements (like low displacement engines) as years passed by
Interestingly, RED CLUSTER represents the industry's age-old attempts to improve mileage by merely keeping the weights low, but this has not helped them achieve results as good as the latest improvements in powertrain technologies, as seen in PURPLE CLUSTER.
In contrast to PURPLE CLUSTER, the BLUE CLUSTER represents the old age heavy weight cars that had large displacement engine given poor mileage
GREEN CLUSTER represents cars over the entire history, with close to median weights and below median efficiencies
Though ORANGE CLUSTER cars are relatively new-age light-weight cars, their mileage is not up to the range of PURPLE CLUSTER; this could be a result of missing out on technologies for the sake of reduced capital investments (car costs/technology costs).
also it could be noted that German & Japanese cars have remained to be low weight, and made significant transitions towards improved efficiencies, while American cars slow in catching up
3. Clustering:
G. Pass a new DataPoint and predict which cluster it belongs to.
# let's predict the clusters for the test dataset kept aside
test_scl=pd.DataFrame(scl.transform(test),columns=test.columns,index=test.index) # standard transformation same as training
test_pred=pd.DataFrame(model.predict(test_scl),columns=["CLUSTER"],index=test_scl.index,dtype='object') # prediction
test_clust=pd.merge(df['car_name'],test,left_index=True,right_index=True,how='inner') # merge car_name
test_clust=pd.merge(test_clust,test_pred,left_index=True,right_index=True,how='inner') # merge labels
test_clust['cl_col']=''
for i,clt in enumerate(cl):
test_clust.loc[test_clust['CLUSTER']==clt,["cl_col"]]=colmap[i] # colour encoding same as model data
display(test_clust) # display results of prediction
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | CLUSTER | cl_col | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1970 buick skylark 320 | 15.0 | 8.0 | 350.0 | 165.0 | 3693.0 | 11.5 | 70.0 | 1.0 | 4 | #636EFA |
| 41 | 1971 plymouth fury iii | 14.0 | 8.0 | 318.0 | 150.0 | 4096.0 | 13.0 | 71.0 | 1.0 | 4 | #636EFA |
| 159 | 1975 ford ltd | 14.0 | 8.0 | 351.0 | 148.0 | 4657.0 | 13.5 | 75.0 | 1.0 | 4 | #636EFA |
the above datapoints have been aptly put in BLUE CLUSTER for they are old aged, poor efficiency cars, primarily of American origin (1)
# lets try to input a new data point, focebly synthesised to sit in PURPLE CLUSTER
datapoint=[45,4,100,75,2000,15,80,3] # new data
datapoint=np.reshape(datapoint,(1,8)) # reshape
new=pd.DataFrame(datapoint,columns=train.columns) # dataframing
new=new.apply(pd.to_numeric,errors='ignore',downcast='float',axis=0) # downcasting
display(new) # witness data
new_scl=pd.DataFrame(scl.transform(new),columns=new.columns) # standard transformation same as training
new_pred=pd.DataFrame(model.predict(new_scl),columns=["CLUSTER"],index=new_scl.index,dtype='object') # prediction
new_clust=pd.merge(new,new_pred,left_index=True,right_index=True,how='inner') # merge lables
new_clust['cl_col']=''
for i,clt in enumerate(cl):
new_clust.loc[new_clust['CLUSTER']==clt,["cl_col"]]=colmap[i] # colour encoding as same as model data
display(new_clust) # display results of prediction
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | 45.0 | 4.0 | 100.0 | 75.0 | 2000.0 | 15.0 | 80.0 | 3.0 |
| mpg | cyl | disp | hp | wt | acc | yr | origin | CLUSTER | cl_col | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45.0 | 4.0 | 100.0 | 75.0 | 2000.0 | 15.0 | 80.0 | 3.0 | 2 | #AB63FA |
Our model successfully predicted the intended datapoint into the PURPLE CLUSTER, as expected.
# overlay model clusters (faint) with the held-out test points and the new
# synthetic point (large, opaque) in one 3D view
fig = go.Figure()
fig.add_trace(go.Scatter3d(x=clust['wt'],y=clust['yr'],z=clust['mpg'], mode='markers', name='model',
marker=dict(color=clust['cl_col'],opacity=0.2),text=clust["CLUSTER"],
hovertemplate='<b>CLUSTER %{text}</b><br>'+
'weight:%{x}<br>'+
'year:%{y}<br>'+
'mileage:%{z}'))
# FIX: the two templates below used the invalid tag '</br>' and mis-nested
# <b><i>; replaced with properly nested <i>…</i> and a real <br> line break
fig.add_trace(go.Scatter3d(x=test_clust['wt'],y=test_clust['yr'],z=test_clust['mpg'], mode='markers', name='Test',
marker=dict(color=test_clust['cl_col'],opacity=1,size=12),text=test_clust["CLUSTER"],
hovertemplate='<b><i>TEST DATA</i><br>'+
'CLUSTER %{text}</b><br>'+
'weight:%{x}<br>'+
'year:%{y}<br>'+
'mileage:%{z}'))
fig.add_trace(go.Scatter3d(x=new_clust['wt'],y=new_clust['yr'],z=new_clust['mpg'], mode='markers', name='New',
marker=dict(color=new_clust['cl_col'],opacity=1,size=12),text=new_clust["CLUSTER"],
hovertemplate='<b><i>NEW DATA</i><br>'+
'CLUSTER %{text}</b><br>'+
'weight:%{x}<br>'+
'year:%{y}<br>'+
'mileage:%{z}'))
fig.update_layout(showlegend=False,height=1000,title="3D Scatter",scene=dict(xaxis_title='Weight',yaxis_title='Year',zaxis_title='Mileage'))
DOMAIN: Automobile
CONTEXT: The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
DATA DESCRIPTION: The data contains features extracted from the silhouette of vehicles in different angles. Four "Corgie" model vehicles were used for the experiment: a double decker bus, Cheverolet van, Saab 9000 and an Opel Manta 400 cars. This particular combination of vehicles was chosen with the expectation that the bus, van and either one of the cars would be readily distinguishable, but it would be more difficult to distinguish between the cars.
• All the features are numeric i.e. geometric features extracted from the silhouette.
PROJECT OBJECTIVE: Apply dimensionality reduction technique – PCA and train a model and compare relative results.
1. Data Understanding & Cleaning:
A. Read ‘vehicle.csv’ and save as DataFrame.
B. Check percentage of missing values and impute with correct approach.
shil=pd.read_csv('vehicle.csv') # read the silhouette dataset
shil.head() # review first rows
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
# review percentage of missing values
# nulsCount is a custom function defined earlier in the notebook — presumably
# it tabulates NULL/NaN/blank counts per column (see its definition)
nuls=nulsCount(shil)
display(nuls)
# let's visualise the NaNs in the dataframe
mno.matrix(shil, figsize = (20, 6));
| NULL | NULL % | NAN | NAN % | BLANKS | BLANKS % | UNEXP | UNEXP % | |
|---|---|---|---|---|---|---|---|---|
| circularity | 5 | 0.59 | 5 | 0.59 | 0 | 0.0 | 0 | 0.0 |
| distance_circularity | 4 | 0.47 | 4 | 0.47 | 0 | 0.0 | 0 | 0.0 |
| radius_ratio | 6 | 0.71 | 6 | 0.71 | 0 | 0.0 | 0 | 0.0 |
| pr.axis_aspect_ratio | 2 | 0.24 | 2 | 0.24 | 0 | 0.0 | 0 | 0.0 |
| scatter_ratio | 1 | 0.12 | 1 | 0.12 | 0 | 0.0 | 0 | 0.0 |
| elongatedness | 1 | 0.12 | 1 | 0.12 | 0 | 0.0 | 0 | 0.0 |
| pr.axis_rectangularity | 3 | 0.35 | 3 | 0.35 | 0 | 0.0 | 0 | 0.0 |
| scaled_variance | 3 | 0.35 | 3 | 0.35 | 0 | 0.0 | 0 | 0.0 |
| scaled_variance.1 | 2 | 0.24 | 2 | 0.24 | 0 | 0.0 | 0 | 0.0 |
| scaled_radius_of_gyration | 2 | 0.24 | 2 | 0.24 | 0 | 0.0 | 0 | 0.0 |
| scaled_radius_of_gyration.1 | 4 | 0.47 | 4 | 0.47 | 0 | 0.0 | 0 | 0.0 |
| skewness_about | 6 | 0.71 | 6 | 0.71 | 0 | 0.0 | 0 | 0.0 |
| skewness_about.1 | 1 | 0.12 | 1 | 0.12 | 0 | 0.0 | 0 | 0.0 |
| skewness_about.2 | 1 | 0.12 | 1 | 0.12 | 0 | 0.0 | 0 | 0.0 |
there are less than 1% of missing values in above listed columns
Lets impute them appropriately
# let's create a benchmark 5 point summary before imputing
# (rows with any NaN dropped so the summary reflects only observed values)
shil.dropna(axis=0,how='any').apply(pd.to_numeric,errors='ignore',downcast='float',axis=0).describe()[3:]
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| min | 73.0 | 33.0 | 40.0 | 104.0 | 47.0 | 2.0 | 112.0 | 26.0 | 17.0 | 118.0 | 130.0 | 184.0 | 109.0 | 59.0 | 0.0 | 0.0 | 176.0 | 181.0 |
| 25% | 87.0 | 40.0 | 70.0 | 141.0 | 57.0 | 7.0 | 146.0 | 33.0 | 19.0 | 137.0 | 167.0 | 318.0 | 149.0 | 67.0 | 2.0 | 6.0 | 184.0 | 191.0 |
| 50% | 93.0 | 44.0 | 79.0 | 167.0 | 61.0 | 8.0 | 157.0 | 43.0 | 20.0 | 146.0 | 179.0 | 364.0 | 173.0 | 71.0 | 6.0 | 11.0 | 189.0 | 197.0 |
| 75% | 100.0 | 49.0 | 98.0 | 195.0 | 65.0 | 10.0 | 198.0 | 46.0 | 23.0 | 159.0 | 217.0 | 586.0 | 198.0 | 75.0 | 9.0 | 19.0 | 193.0 | 201.0 |
| max | 119.0 | 59.0 | 112.0 | 333.0 | 138.0 | 55.0 | 265.0 | 61.0 | 29.0 | 188.0 | 320.0 | 1018.0 | 268.0 | 135.0 | 22.0 | 41.0 | 206.0 | 211.0 |
# impute the nans
df=interpolate(shil)
# interpolate is a custom-built function (code written at start of notebook)
# imputes missing values to seamlessly merge with the distribution
# does not use mean / mode as imputing value
# performs ML based prediction that best fits the overall distribution
# recheck for missing values
display("nulsCount Result:",nulsCount(df))
# visualise if any missing values present in dataframe
mno.matrix(df, figsize = (20, 6));
'nulsCount Result:'
None
# let's review the 5 point summary again post imputing
# (compare against the pre-impute benchmark above to confirm the distribution held)
df.describe()[3:]
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| min | 73.0 | 33.0 | 40.0 | 104.0 | 47.0 | 2.0 | 112.00 | 26.0 | 17.0 | 118.0 | 130.0 | 184.00 | 109.0 | 59.0 | 0.0 | 0.0 | 176.0 | 181.00 |
| 25% | 87.0 | 40.0 | 70.0 | 141.0 | 57.0 | 7.0 | 146.25 | 33.0 | 19.0 | 137.0 | 167.0 | 318.25 | 149.0 | 67.0 | 2.0 | 5.0 | 184.0 | 190.25 |
| 50% | 93.0 | 44.0 | 80.0 | 167.0 | 61.0 | 8.0 | 157.00 | 43.0 | 20.0 | 146.0 | 178.5 | 364.00 | 173.5 | 71.5 | 6.0 | 11.0 | 188.0 | 197.00 |
| 75% | 100.0 | 49.0 | 98.0 | 195.0 | 65.0 | 10.0 | 198.00 | 46.0 | 23.0 | 159.0 | 217.0 | 587.00 | 198.0 | 75.0 | 9.0 | 19.0 | 193.0 | 201.00 |
| max | 119.0 | 59.0 | 112.0 | 333.0 | 138.0 | 55.0 | 265.00 | 61.0 | 29.0 | 188.0 | 320.0 | 1018.00 | 268.0 | 135.0 | 22.0 | 41.0 | 206.0 | 211.00 |
Missing values are successfully imputed without disturbing the data distribution
1. Data Understanding & Cleaning:
C. Visualize a Pie-chart and print percentage of values for variable ‘class’.
# single-level sunburst of 'class' — renders as a pie chart with percentages
fig=px.sunburst(df,path=['class'], # produces pie chart
height=400,hover_name='class:'+df['class'],
color=df['class'],color_discrete_sequence=px.colors.carto.Vivid)
fig.update_layout(title="Distribution of classes")
fig.update_traces(textinfo="label+percent entry",insidetextorientation='tangential')
fig.add_annotation(x=0.5,y=0.52,text='Classes',showarrow=False,font=dict(size=20))
fig.show()
based on problem description and the above plot, one may assume that the dataset contains approximately equal datapoint for bus, van and 2 of the cars
1. Data Understanding & Cleaning:
D. Check for duplicate rows in the data and impute with correct approach.
df.duplicated().sum() # find duplicate rows
0
no duplicate rows found, lets check for duplicates after dropping class column
df.drop('class',axis=1).duplicated().sum()
0
Still no duplicates found; hence the dataset is a collection of unique datapoints.
2. Data Preparation:
A. Split data into X and Y. [Train and Test optional]
B. Standardize the Data.
# target & independent variables separation
Y=df['class']
X=df.drop('class',axis=1)
X.info() # review the datatypes
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null float32 1 circularity 846 non-null float32 2 distance_circularity 846 non-null float32 3 radius_ratio 846 non-null float32 4 pr.axis_aspect_ratio 846 non-null float32 5 max.length_aspect_ratio 846 non-null float32 6 scatter_ratio 846 non-null float32 7 elongatedness 846 non-null float32 8 pr.axis_rectangularity 846 non-null float32 9 max.length_rectangularity 846 non-null float32 10 scaled_variance 846 non-null float32 11 scaled_variance.1 846 non-null float32 12 scaled_radius_of_gyration 846 non-null float32 13 scaled_radius_of_gyration.1 846 non-null float32 14 skewness_about 846 non-null float32 15 skewness_about.1 846 non-null float32 16 skewness_about.2 846 non-null float32 17 hollows_ratio 846 non-null float32 dtypes: float32(18) memory usage: 59.6 KB
all attributes are numeric, lets proceed with standardisation
# preview the spread of each raw feature as stacked box plots (one row each)
fig = make_subplots(rows=len(X.columns), cols=1)
for row_no, feature in enumerate(X.columns, start=1):
    fig.add_trace(go.Box(x=X[feature], name=feature, hovertemplate='%{x}', jitter=1), row_no, 1)
fig.update_layout(height=700, showlegend=False)
fig.show()
the scales of features varies, lower limits ranges from 0 to 200, while upper limits ranges from 30 to 1000
hence scaling is necessary
# Train & Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.20, # split ratio of 80:20
random_state=129) # random seed
# standardize: fit only on the training fold to avoid data leakage
scl=StandardScaler()
X_train = pd.DataFrame(scl.fit_transform(X_train),columns=X_train.columns,index=X_train.index)
X_test = pd.DataFrame(scl.transform(X_test),columns=X_test.columns,index=X_test.index) # transform only
# let's preview the scaling of the standardized dataset
fig = make_subplots(rows=len(X_train.columns),cols=1)
for i,col in enumerate(X_train.columns):
fig.add_trace(go.Box(x=X_train[col],name=col,hovertemplate='%{x}',jitter=1),i+1,1)
fig.update_layout(height=700,showlegend=False)
fig.show()
the features are rescaled between -2 to 3, except for outliers in
max.length_aspect_ratio,
pr.axis_aspect_ratio,
scaled_radius_of_gyration.1 features. Those columns may be need further processing based on model needs
3. Model Building:
A. Train a base Classification model using SVM.
B. Print Classification metrics for train data.
clf = SVC(gamma=0.025, C=3) # define classifier
st1=time.process_time() # timer
clf.fit(X_train , Y_train) # learn
print("time taken : %.6f"%((time.process_time()-st1))) # elapsed time
pred_train=clf.predict(X_train) # predict on training dataset
pred_test=clf.predict(X_test) # predict on testing dataset
reporter(Y_train,pred_train,Y_test,pred_test,"SVC_base") # generate reports (custom-built function : code in the beginning of notebook)
time taken : 0.015625
| accuracy | precision_car | precision_bus | precision_van | recall_car | recall_bus | recall_van | fscore_car | fscore_bus | fscore_van | |
|---|---|---|---|---|---|---|---|---|---|---|
| SVC_base_training | 0.980769 | 0.985337 | 0.983425 | 0.967532 | 0.988235 | 0.983425 | 0.961290 | 0.986784 | 0.983425 | 0.964401 |
| SVC_base_test | 0.958824 | 0.988506 | 0.944444 | 0.914894 | 0.966292 | 0.918919 | 0.977273 | 0.977273 | 0.931507 | 0.945055 |
3. Model Building:
C. Apply PCA on the data with 10 components.
D. Visualize Cumulative Variance Explained with Number of Components.
E. Draw a horizontal line on the above plot to highlight the threshold of 90%.
F. Apply PCA on the data. This time Select Minimum Components with 90% or above variance explained.
pca = PCA(n_components=10) # define modeler
pca.fit(X_train) # learn
# calculate explained variance cumulative summation
exp_var=np.cumsum(pca.explained_variance_ratio_)
# visualize the above
fig=go.Figure()
# cumulative explained variance as a step curve
fig.add_trace(go.Scatter(x=list(range(1,len(exp_var)+1)), y=exp_var,
mode='lines+markers',name='',
line={'shape':'hv','color':'#FFA15A'},
hovertemplate='<b>CUMULATIVE VARIANCE EXPLAINED</b><br>'+
'%{x} components explain %{y:.2f} of variance'))
# FIX: threshold line length derived from exp_var instead of hard-coded 10,
# so it stays in sync if n_components changes; also closed the <b> tag
fig.add_trace(go.Scatter(x=list(range(1,len(exp_var)+1)), y=np.ones(len(exp_var))*0.9,
mode='lines',name='',
line={'shape':'hv','color':'#00CC96'},
hovertemplate='<b>90% threshold line</b>'))
fig.update_xaxes(title_text="n_components")
fig.update_yaxes(title_text="cumulative variance explained")
fig.update_layout(showlegend=False)
fig.add_annotation(x=9.3, y=0.915,text="Threshold @ 90%",showarrow=False)
fig.show()
The above visualisation shows that 5 components are capable of explaining 90% of the variance in the data.
hence lets choose 5 components for further modeling
pca5 = PCA(n_components=5) # define modeler for components with 90% variance explanation
X_train5=pca5.fit_transform(X_train); # learn & transform (fit on train only)
X_test5=pca5.transform(X_test); # transform only — reuse train's projection
3. Model Building:
G. Train SVM model on components selected from above step.
H. Print Classification metrics for train data of above model and share insights.
clf_pca = SVC(gamma=0.025, C=3)           # classifier on the PCA-reduced features
t0 = time.process_time()
clf_pca.fit(X_train5, Y_train)            # train on the 5-component projection
elapsed = time.process_time() - t0
print("time taken : %.6f" % elapsed)      # CPU time spent fitting
pred_train5 = clf_pca.predict(X_train5)   # predictions on the training split
pred_test5 = clf_pca.predict(X_test5)     # predictions on the test split
# build the score comparison table (custom-built helper defined at the top of the notebook)
reporter(Y_train, pred_train5, Y_test, pred_test5, "SVC_PCA_5c")
time taken : 0.062500
| accuracy | precision_car | precision_bus | precision_van | recall_car | recall_bus | recall_van | fscore_car | fscore_bus | fscore_van | |
|---|---|---|---|---|---|---|---|---|---|---|
| SVC_base_training | 0.980769 | 0.985337 | 0.983425 | 0.967532 | 0.988235 | 0.983425 | 0.961290 | 0.986784 | 0.983425 | 0.964401 |
| SVC_base_test | 0.958824 | 0.988506 | 0.944444 | 0.914894 | 0.966292 | 0.918919 | 0.977273 | 0.977273 | 0.931507 | 0.945055 |
| SVC_PCA_5c_training | 0.788462 | 0.841962 | 0.758824 | 0.683453 | 0.908824 | 0.712707 | 0.612903 | 0.874116 | 0.735043 | 0.646259 |
| SVC_PCA_5c_test | 0.741176 | 0.790000 | 0.645161 | 0.692308 | 0.887640 | 0.540541 | 0.613636 | 0.835979 | 0.588235 | 0.650602 |
for the given dataset, the SVM on full features performed better in terms of accuracy and almost every other score compared to the results after PCA
this drop is a direct result of the dimensionality reduction that PCA performs
also, PCA has not reduced any computational times
this is probably because the dataset is not huge and does not contain very many features, which would be needed to benefit from PCA
4. Performance Improvement:
A. Train another SVM on the components out of PCA. Tune the parameters to improve performance.
B. Share best Parameters observed from above step.
C. Print Classification metrics for train data of above model and share relative improvement in performance in all the models along with insights.
# encode the string target labels as integer codes (stringified) for hypertuning
labels = list(Y_train.unique())                     # index -> original label, used later to decode
code_of = {lab: i for i, lab in enumerate(labels)}  # original label -> integer code
# Series.replace maps all known labels in a single pass and leaves unseen
# values untouched — same outcome as the original per-label .loc masking loop,
# without n_labels separate boolean scans and chained assignments
Y_tr_coded = Y_train.replace(code_of).astype(str)
Y_te_coded = Y_test.replace(code_of).astype(str)
# Randomised search
# randomised search: sample 20 candidate parameter sets from a wide space
clf_pca_tuning = SVC()                          # untuned base estimator
param_space = {'C': np.arange(0.01, 5, 0.05),   # search space
               'gamma': np.arange(1, 10, 0.1),
               'kernel': ['linear', 'rbf']}
rscv = RandomizedSearchCV(estimator=clf_pca_tuning,
                          param_distributions=param_space,
                          n_iter=20, cv=10, scoring="accuracy")
t0 = time.process_time()
rscv.fit(X_train5, Y_tr_coded)                  # search on the PCA-reduced training set
print("time taken : %.6f" % ((time.process_time() - t0)))  # elapsed CPU time
time taken : 5.265625
# display the best parameter combination found by the randomised search
rscv.best_params_
{'kernel': 'rbf', 'gamma': 1.6000000000000005, 'C': 2.11}
log of several runs
{'kernel': 'rbf', 'gamma': 1.4000000000000004, 'C': 3.31}
{'kernel': 'rbf', 'gamma': 1.4000000000000004, 'C': 2.81}
{'kernel': 'linear', 'gamma': 8.000000000000007, 'C': 1.36}
{'kernel': 'linear', 'gamma': 7.600000000000006, 'C': 1.9100000000000001}
{'kernel': 'rbf', 'gamma': 1.7000000000000006, 'C': 2.96}
{'kernel': 'rbf', 'gamma': 3.9000000000000026, 'C': 2.46}
{'kernel': 'rbf', 'gamma': 1.8000000000000007, 'C': 4.71}
{'kernel': 'rbf', 'gamma': 1.1, 'C': 0.6100000000000001}
{'kernel': 'rbf', 'gamma': 1.4000000000000004, 'C': 1.4600000000000002}
{'kernel': 'rbf', 'gamma': 1.4000000000000004, 'C': 2.31}
{'kernel': 'rbf', 'gamma': 2.8000000000000016, 'C': 1.76}
based on the above log, lets run a narrowed down gridsearch
# gridsearchcv
clf_pca_tuning=SVC() # plain modeler
p={'C': [1.5,2,2.5,3,3.5,4], # parameter grid
'gamma': [0.025,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,3],
'kernel': ['rbf']}
gs = GridSearchCV(estimator=clf_pca_tuning, param_grid=p,scoring='accuracy',n_jobs=-1,cv=10)
st1=time.process_time() # timer
gs.fit(X_train5, Y_tr_coded)
print("time taken : %.6f"%((time.process_time()-st1))) # elapsed time
time taken : 0.718750
# display the best parameter combination found by the grid search
gs.best_params_
{'C': 1.5, 'gamma': 0.5, 'kernel': 'rbf'}
# final model: SVC built directly from the grid-search winners
# (**gs.best_params_ unpacks C, gamma and kernel in one go,
#  replacing three repetitive .get() lookups)
clf_pca_tuned = SVC(**gs.best_params_)
st1 = time.process_time()                       # timer
clf_pca_tuned.fit(X_train5, Y_tr_coded)         # refit on the full training projection
print("time taken : %.6f" % ((time.process_time() - st1)))  # elapsed CPU time
pred_train5_tuned = clf_pca_tuned.predict(X_train5)  # predict on training dataset
pred_test5_tuned = clf_pca_tuned.predict(X_test5)    # predict on testing dataset
# map integer codes back to the original string labels for reporting
pred_train5_tuned_decoded = [labels[int(i)] for i in pred_train5_tuned]
pred_test5_tuned_decoded = [labels[int(i)] for i in pred_test5_tuned]
# generate reports (custom-built function : code in the beginning of notebook)
reporter(Y_train, pred_train5_tuned_decoded, Y_test, pred_test5_tuned_decoded, "SVC_PCA_5c_tuned")
time taken : 0.015625
| accuracy | precision_car | precision_bus | precision_van | recall_car | recall_bus | recall_van | fscore_car | fscore_bus | fscore_van | |
|---|---|---|---|---|---|---|---|---|---|---|
| SVC_base_training | 0.980769 | 0.985337 | 0.983425 | 0.967532 | 0.988235 | 0.983425 | 0.961290 | 0.986784 | 0.983425 | 0.964401 |
| SVC_base_test | 0.958824 | 0.988506 | 0.944444 | 0.914894 | 0.966292 | 0.918919 | 0.977273 | 0.977273 | 0.931507 | 0.945055 |
| SVC_PCA_5c_training | 0.788462 | 0.841962 | 0.758824 | 0.683453 | 0.908824 | 0.712707 | 0.612903 | 0.874116 | 0.735043 | 0.646259 |
| SVC_PCA_5c_test | 0.741176 | 0.790000 | 0.645161 | 0.692308 | 0.887640 | 0.540541 | 0.613636 | 0.835979 | 0.588235 | 0.650602 |
| SVC_PCA_5c_tuned_training | 0.921598 | 0.950147 | 0.876289 | 0.914894 | 0.952941 | 0.939227 | 0.832258 | 0.951542 | 0.906667 | 0.871622 |
| SVC_PCA_5c_tuned_test | 0.794118 | 0.814433 | 0.714286 | 0.815789 | 0.887640 | 0.675676 | 0.704545 | 0.849462 | 0.694444 | 0.756098 |
it could be found from above scores log that Hypertuning has secured 92% accuracy in training data.
the performance on testing data is quite low at 79%, but has improved from 74% of untuned model
other classifier models might be attempted to get generalised performance on training & test data
to comment about PCA: yes, PCA with model hypertuning can help obtain reasonably good accuracy (or other scores)
but a justified decision based on the available computational capacities will help
5. Data Understanding & Cleaning:
A. Explain pre-requisite/assumptions of PCA.
B. Explain advantages and limitations of PCA.
the PCA is performed on independent variables considering the following pre-requisites / assumptions
• explainability of the model built is not of much importance
• there are only linear relations among the attributes
• the features are prescaled to avoid high-scale features being captured as priority
• loss of accuracy at a minuscule level is acceptable compared to the benefits in computational costs
• skewness in features is properly treated to avoid impacting PCA
Accordingly in our modeling,
we have not focussed on explanation of model fit
relationships among the attributes need to be studied in detail
features were scaled with z-score using StandardScaler
3 features had skewness, impact of it on the PCA needs to be studied
# pairwise scatter-matrix of the independent variables to eyeball
# linear vs non-linear relations (a PCA linearity pre-check)
px.scatter_matrix(X,dimensions=X.columns,height=800,opacity=0.5)
as can be seen, several feature pairs exhibit linear relations, certain pairs don't show any relation, while a few also show curved, non-linear relations. this could be one major cause of the loss of accuracy
Dimensionality reduction (PCA as a feature extraction method) provides the following advantages
• Due to orthogonality of the principal components, multicollinearity of the source data is eliminated
• Due to reduced attributes to learn upon, the computational cost of learning a dataset reduces (in terms of processing times)
• Also, since attributes are reduced, less memory is sufficient, thus further reducing computational costs
• Provides an abstract summary in far fewer dimensions (even though unexplainable), enhancing visualisations
• Reduction in the number of attributes inherently improves the aspect ratio of the dataset, thus avoiding the curse of dimensionality
• Since principal components capture more of the information and less of the noise, over-fitting is avoided
• While PCA brings down the computational costs, it manages to maintain the largest variances in the data, thus not causing much information loss
The limitation of PCA could be summarised as follows
• since PCA tries to capture the largest variances in the data, unscaled data could lead to erroneous outcomes
• changing the scaling methodology could affect PCA largely
• Since PCA combines information from various attributes, the components will not possess a meaningful relation to the real world, so the learned model and its coefficients lose explainability
• skewness in data (with thick tails) can affect PCA's performance
• non-linear relations in the data are not captured by PCA, thus could cause loss of information